{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regressão Linear Simples - Trabalho\n",
    "### Aluno: Joel Ribeiro ~ 371822\n",
    "------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Bibliotecas\n",
    "%matplotlib inline \n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "from pandas import DataFrame\n",
    "import math \n",
    "from math import sqrt\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Adicionando algumas funções acessórias a regressão linear"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#função para carregar o arquivo\n",
    "def load_CSV(file):\n",
    "    df = pd.read_csv(file)\n",
    "    return df\n",
    "\n",
    "#função para converter um conjunto de dados carregado para um array de números float\n",
    "def str_column_to_float (df):\n",
    "    df=df.values\n",
    "    return np.asarray(df)\n",
    "\n",
    "#função para avaliar um algoritmo usando um conjunto de treino(60%) e teste(40%)\n",
    "def split_train_split (dataset,splitRatio):\n",
    "    trainSize = int(len(dataset) * splitRatio)\n",
    "    trainSet = []\n",
    "    copy = list(dataset)\n",
    "    while len(trainSet) < trainSize:\n",
    "        index = random.randrange(len(copy))\n",
    "        trainSet.append(copy.pop(index))\n",
    "    testSet = copy\n",
    "    return [trainSet, testSet]\n",
    "\n",
    "# Calculate coefficients\n",
    "def coefficients(dataset):\n",
    "    x = [row[0] for row in dataset]\n",
    "    y = [row[1] for row in dataset]\n",
    "    x_mean, y_mean = mean(x), mean(y)\n",
    "    b1 = covariance(x, x_mean, y, y_mean) / variance(x, x_mean)\n",
    "    b0 = y_mean - b1 * x_mean\n",
    "    return [b0, b1]\n",
    "\n",
    "def simple_linear_regression(train, test):\n",
    "    predictions = list()\n",
    "    b0, b1 = coefficients(train)\n",
    "    for row in test:\n",
    "        ypred = b0 + b1 * row[0]\n",
    "        predictions.append(ypred)\n",
    "    return predictions\n",
    "\n",
    "# função para calcular RMSE (Root Mean Squared error)\n",
    "def rmse_metric(actual, predicted):\n",
    "    sum_error = 0.0\n",
    "    for i in range(len(actual)):\n",
    "        prediction_error = predicted[i] - actual[i]\n",
    "        sum_error += (prediction_error ** 2)\n",
    "    mean_error = sum_error / float(len(actual))\n",
    "    return sqrt(mean_error)\n",
    "\n",
    "# função para avaliar um algoritmo\n",
    "def evaluate_algorithm(dataset, algorithm):\n",
    "    test_set = list()\n",
    "    for row in dataset:\n",
    "        row_copy = list(row)\n",
    "        row_copy[-1] = None\n",
    "        test_set.append(row_copy)\n",
    "    predicted = algorithm(dataset, test_set)\n",
    "    print(predicted)\n",
    "    actual = [row[-1] for row in dataset]\n",
    "    rmse = rmse_metric(actual, predicted)\n",
    "    return rmse\n",
    "\n",
    "##funções matemáticas que deverão ser utilizadas nas funções definidas anteriormente\n",
    "\n",
    "# Calculate the mean value of a list of numbers\n",
    "def mean(values):\n",
    "    return sum(values) / float(len(values))\n",
    "\n",
    "# Calculate the variance of a list of numbers\n",
    "def variance(values, mean):\n",
    "    return sum([(x-mean)**2 for x in values])\n",
    "\n",
    "# Calculate covariance between x and y\n",
    "def covariance(x, mean_x, y, mean_y):\n",
    "    covar = 0.0\n",
    "    for i in range(len(x)):\n",
    "        covar += (x[i] - mean_x) * (y[i] - mean_y)\n",
    "    return covar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    108  392.5\n",
      "0    19   46.2\n",
      "1    13   15.7\n",
      "2   124  422.2\n",
      "3    40  119.4\n",
      "4    57  170.9\n",
      "5    23   56.9\n",
      "6    14   77.5\n",
      "7    45  214.0\n",
      "8    10   65.3\n",
      "9     5   20.9\n",
      "10   48  248.1\n",
      "11   11   23.5\n",
      "12   23   39.6\n",
      "13    7   48.8\n",
      "14    2    6.6\n",
      "15   24  134.9\n",
      "16    6   50.9\n",
      "17    3    4.4\n",
      "18   23  113.0\n",
      "19    6   14.8\n",
      "20    9   48.7\n",
      "21    9   52.1\n",
      "22    3   13.2\n",
      "23   29  103.9\n",
      "24    7   77.5\n",
      "25    4   11.8\n",
      "26   20   98.1\n",
      "27    7   27.9\n",
      "28    4   38.1\n",
      "29    0    0.0\n",
      "..  ...    ...\n",
      "32    5   40.3\n",
      "33   22  161.5\n",
      "34   11   57.2\n",
      "35   61  217.6\n",
      "36   12   58.1\n",
      "37    4   12.6\n",
      "38   16   59.6\n",
      "39   13   89.9\n",
      "40   60  202.4\n",
      "41   41  181.3\n",
      "42   37  152.8\n",
      "43   55  162.8\n",
      "44   41   73.4\n",
      "45   11   21.3\n",
      "46   27   92.6\n",
      "47    8   76.1\n",
      "48    3   39.9\n",
      "49   17  142.1\n",
      "50   13   93.0\n",
      "51   13   31.9\n",
      "52   15   32.1\n",
      "53    8   55.6\n",
      "54   29  133.3\n",
      "55   30  194.5\n",
      "56   24  137.9\n",
      "57    9   87.4\n",
      "58   31  209.8\n",
      "59   14   95.5\n",
      "60   53  244.6\n",
      "61   26  187.5\n",
      "\n",
      "[62 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "df=load_CSV('insurance.csv')\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[  19.    46.2]\n",
      " [  13.    15.7]\n",
      " [ 124.   422.2]\n",
      " [  40.   119.4]\n",
      " [  57.   170.9]\n",
      " [  23.    56.9]\n",
      " [  14.    77.5]\n",
      " [  45.   214. ]\n",
      " [  10.    65.3]\n",
      " [   5.    20.9]\n",
      " [  48.   248.1]\n",
      " [  11.    23.5]\n",
      " [  23.    39.6]\n",
      " [   7.    48.8]\n",
      " [   2.     6.6]\n",
      " [  24.   134.9]\n",
      " [   6.    50.9]\n",
      " [   3.     4.4]\n",
      " [  23.   113. ]\n",
      " [   6.    14.8]\n",
      " [   9.    48.7]\n",
      " [   9.    52.1]\n",
      " [   3.    13.2]\n",
      " [  29.   103.9]\n",
      " [   7.    77.5]\n",
      " [   4.    11.8]\n",
      " [  20.    98.1]\n",
      " [   7.    27.9]\n",
      " [   4.    38.1]\n",
      " [   0.     0. ]\n",
      " [  25.    69.2]\n",
      " [   6.    14.6]\n",
      " [   5.    40.3]\n",
      " [  22.   161.5]\n",
      " [  11.    57.2]\n",
      " [  61.   217.6]\n",
      " [  12.    58.1]\n",
      " [   4.    12.6]\n",
      " [  16.    59.6]\n",
      " [  13.    89.9]\n",
      " [  60.   202.4]\n",
      " [  41.   181.3]\n",
      " [  37.   152.8]\n",
      " [  55.   162.8]\n",
      " [  41.    73.4]\n",
      " [  11.    21.3]\n",
      " [  27.    92.6]\n",
      " [   8.    76.1]\n",
      " [   3.    39.9]\n",
      " [  17.   142.1]\n",
      " [  13.    93. ]\n",
      " [  13.    31.9]\n",
      " [  15.    32.1]\n",
      " [   8.    55.6]\n",
      " [  29.   133.3]\n",
      " [  30.   194.5]\n",
      " [  24.   137.9]\n",
      " [   9.    87.4]\n",
      " [  31.   209.8]\n",
      " [  14.    95.5]\n",
      " [  53.   244.6]\n",
      " [  26.   187.5]]\n"
     ]
    }
   ],
   "source": [
    "df = str_column_to_float(df)\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[array([ 25. ,  69.2]), array([  17. ,  142.1]), array([  8. ,  76.1]), array([ 13. ,  31.9]), array([ 27. ,  92.6]), array([ 11. ,  23.5]), array([  37. ,  152.8]), array([ 19. ,  46.2]), array([  7. ,  77.5]), array([  4. ,  11.8]), array([  9. ,  87.4]), array([  24. ,  137.9]), array([  31. ,  209.8]), array([  23.,  113.]), array([  7. ,  27.9]), array([ 14. ,  95.5]), array([ 13. ,  89.9]), array([ 11. ,  57.2]), array([  6. ,  14.6]), array([ 20. ,  98.1]), array([  3. ,  39.9]), array([  7. ,  48.8]), array([  55. ,  162.8]), array([ 15. ,  32.1]), array([ 3. ,  4.4]), array([ 11. ,  21.3]), array([ 13.,  93.]), array([  57. ,  170.9]), array([ 124. ,  422.2]), array([  4. ,  38.1]), array([ 41. ,  73.4]), array([ 14. ,  77.5]), array([  4. ,  12.6]), array([  8. ,  55.6]), array([  61. ,  217.6]), array([  53. ,  244.6]), array([  29. ,  103.9])]\n",
      "---------------\n",
      "[array([ 13. ,  15.7]), array([  40. ,  119.4]), array([ 23. ,  56.9]), array([  45.,  214.]), array([ 10. ,  65.3]), array([  5. ,  20.9]), array([  48. ,  248.1]), array([ 23. ,  39.6]), array([ 2. ,  6.6]), array([  24. ,  134.9]), array([  6. ,  50.9]), array([  6. ,  14.8]), array([  9. ,  48.7]), array([  9. ,  52.1]), array([  3. ,  13.2]), array([ 0.,  0.]), array([  5. ,  40.3]), array([  22. ,  161.5]), array([ 12. ,  58.1]), array([ 16. ,  59.6]), array([  60. ,  202.4]), array([  41. ,  181.3]), array([  29. ,  133.3]), array([  30. ,  194.5]), array([  26. ,  187.5])]\n"
     ]
    }
   ],
   "source": [
    "trainSet,testSet= split_train_split(df,0.6)\n",
    "print(trainSet)\n",
    "print('---------------')\n",
    "print(testSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  15.7  119.4   56.9  214.    65.3   20.9  248.1   39.6    6.6  134.9\n",
      "   50.9   14.8   48.7   52.1   13.2    0.    40.3  161.5   58.1   59.6\n",
      "  202.4  181.3  133.3  194.5  187.5]\n",
      "[64.121814387362917, 149.80540927792327, 95.856479161644529, 165.67274166506411, 54.601414955078425, 38.734082567937612, 175.19314109734859, 95.856479161644529, 29.213683135653124, 99.029945639072693, 41.907549045365769, 41.907549045365769, 51.427948477650261, 51.427948477650261, 32.387149613081284, 22.866750180796799, 38.734082567937612, 92.683012684216365, 60.948347909934746, 73.642213819647395, 213.27473882648653, 152.97887575535145, 114.8972780262135, 118.07074450364166, 105.37687859392902]\n"
     ]
    }
   ],
   "source": [
    "predictions = simple_linear_regression(trainSet,testSet)\n",
    "print(np.asarray(testSet)[:,1])\n",
    "print(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39.1590756676614\n"
     ]
    }
   ],
   "source": [
    "print(rmse_metric(np.asarray(testSet)[:,1], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
